602 Project

First of all, we should load the data.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:scales':
## 
##     rescale
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(grid)
file = 'all-ages.csv'
df_all <- read.csv(file)
head(df_all, 10)
##    index Major_code                                 Major
## 1      0       1100                   GENERAL AGRICULTURE
## 2      1       1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3      2       1102                AGRICULTURAL ECONOMICS
## 4      3       1103                       ANIMAL SCIENCES
## 5      4       1104                          FOOD SCIENCE
## 6      5       1105            PLANT SCIENCE AND AGRONOMY
## 7      6       1106                          SOIL SCIENCE
## 8      7       1199             MISCELLANEOUS AGRICULTURE
## 9      8       1301                 ENVIRONMENTAL SCIENCE
## 10     9       1302                              FORESTRY
##                     Major_category  Total Employed
## 1  Agriculture & Natural Resources 128148    90245
## 2  Agriculture & Natural Resources  95326    76865
## 3  Agriculture & Natural Resources  33955    26321
## 4  Agriculture & Natural Resources 103549    81177
## 5  Agriculture & Natural Resources  24280    17281
## 6  Agriculture & Natural Resources  79409    63043
## 7  Agriculture & Natural Resources   6586     4926
## 8  Agriculture & Natural Resources   8549     6392
## 9           Biology & Life Science 106106    87602
## 10 Agriculture & Natural Resources  69447    48228
##    Employed_full_time_year_round Unemployed Unemployment_rate Median P25th
## 1                          74078       2423        0.02614711  50000 34000
## 2                          64240       2266        0.02863606  54000 36000
## 3                          22810        821        0.03024832  63000 40000
## 4                          64937       3619        0.04267890  46000 30000
## 5                          12722        894        0.04918845  62000 38500
## 6                          51077       2070        0.03179089  50000 35000
## 7                           4042        264        0.05086705  63000 39400
## 8                           5074        261        0.03923042  52000 35000
## 9                          65238       4736        0.05128983  52000 38000
## 10                         39613       2144        0.04256333  58000 40500
##    P75th
## 1  80000
## 2  80000
## 3  98000
## 4  72000
## 5  90000
## 6  75000
## 7  88000
## 8  75000
## 9  75000
## 10 80000
file = 'recent-grads.csv'
df_women <- read.csv(file)
head(df_women, 10)
##    index Rank Major_code                                     Major
## 1      0    1       2419                     PETROLEUM ENGINEERING
## 2      1    2       2416            MINING AND MINERAL ENGINEERING
## 3      2    3       2415                 METALLURGICAL ENGINEERING
## 4      3    4       2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING
## 5      4    5       2405                      CHEMICAL ENGINEERING
## 6      5    6       2418                       NUCLEAR ENGINEERING
## 7      6    7       6202                         ACTUARIAL SCIENCE
## 8      7    8       5001                ASTRONOMY AND ASTROPHYSICS
## 9      8    9       2414                    MECHANICAL ENGINEERING
## 10     9   10       2408                    ELECTRICAL ENGINEERING
##       Major_category Total Sample_size   Men Women ShareWomen Employed
## 1        Engineering  2339          36  2057   282  0.1205643     1976
## 2        Engineering   756           7   679    77  0.1018519      640
## 3        Engineering   856           3   725   131  0.1530374      648
## 4        Engineering  1258          16  1123   135  0.1073132      758
## 5        Engineering 32260         289 21239 11021  0.3416305    25694
## 6        Engineering  2573          17  2200   373  0.1449670     1857
## 7           Business  3777          51   832   960  0.5357143     2912
## 8  Physical Sciences  1792          10  2110  1667  0.4413556     1526
## 9        Engineering 91227        1029 12953  2105  0.1397928    76442
## 10       Engineering 81527         631  8407  6548  0.4378469    61928
##    Full_time Part_time Full_time_year_round Unemployed Unemployment_rate Median
## 1       1849       270                 1207         37        0.01838053 110000
## 2        556       170                  388         85        0.11724138  75000
## 3        558       133                  340         16        0.02409639  73000
## 4       1069       150                  692         40        0.05012531  70000
## 5      23170      5180                16697       1672        0.06109771  65000
## 6       2038       264                 1449        400        0.17722641  65000
## 7       2924       296                 2482        308        0.09565217  62000
## 8       1085       553                  827         33        0.02116741  62000
## 9      71298     13101                54639       4650        0.05734228  60000
## 10     55450     12695                41413       3895        0.05917385  60000
##    P25th  P75th College_jobs Non_college_jobs Low_wage_jobs
## 1  95000 125000         1534              364           193
## 2  55000  90000          350              257            50
## 3  50000 105000          456              176             0
## 4  43000  80000          529              102             0
## 5  50000  75000        18314             4440           972
## 6  50000 102000         1142              657           244
## 7  53000  72000         1768              314           259
## 8  31500 109000          972              500           220
## 9  48000  70000        52844            16384          3253
## 10 45000  72000        45829            10874          3170

In our project, we will explore 3 main questions that will guide us through analysis. These questions will explore the unemployment rate, proportion of males vs. females, and the salary within each major category. Our guiding questions are as below: Is there a relationship between major categories and employment (full-time year-round)? 1. Is there a relationship between major categories and the unemployment rate?

Creating 16 lists of dataframes for each major category

major_categories <- unique(df_all$Major_category)
major_list <- list()
nsize <- list()
i = 1
for (category in major_categories){
  major_list[[i]] <- df_all[df_all$Major_category == category,]
  nsize[i] <- nrow(major_list[[i]])
  i = i + 1
}

To make a conclusion about the difference of salaries between different majors the next hypothesis is done:

H0: There is significant difference between in the unemployment rate between major categories HA: There is no significant difference between in the unemployment rate between major categories

For checking the hypothesis it can be compared each major median salary with median salary of all majors. To compare these variables bootstrap can be used:

nsims = 2000
df_means <- data.frame(matrix(ncol = 2, nrow = 0))
df_bootstrap <- data.frame(matrix(ncol = 4, nrow = 0))
df_bootstrap_diff <- data.frame(matrix(ncol = 3, nrow = 0))
df_all_means <- data.frame(matrix(ncol = 1, nrow = 0))
df_diff <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(df_bootstrap) <- c('Major_category', 'Major_mean', 'Confidence_Interval_Left', 'Confidence_Interval_Right')
colnames(df_bootstrap_diff) <- c('Major_category', 'Diff_CI_Left', 'Diff_CI_Right')
colnames(df_means) <- c('Unemployment_rate', 'Major_category')
colnames(df_diff) <- c('Major_category', 'Diff')
colnames(df_all_means) <- c('Mean')
means_unemployment_rate <- list()
nsize_all = nrow(df_all)
temp_all_mean <- rep(0, nsims)
for (i in (1:nsims)) {
      temp_all_mean[i] = mean(sample(df_all$Unemployment_rate, nsize_all, replace = TRUE)) 
}
df_all_means <- data.frame(temp_all_mean)
#df_all_means
values_diff <- numeric(0)
groups <- numeric(0)
for (j in c(1:length(major_categories))){
    temp_mean <- rep(0, nsims)
    temp_diff <- rep(0, nsims)
    category = major_categories[j]
    for (i in (1:nsims)) {
      temp_mean[i] = mean(sample(df_all[df_all$Major_category == category, ]$Unemployment_rate, nsize[[j]], replace = TRUE))
      temp_diff[i] = temp_mean[i] - temp_all_mean[i]
    }
   df_means <- c(category, data.frame(temp_mean))
   values_diff <- c(values_diff, temp_diff)
   groups <- c(groups, rep(category, nsims))
   conf_int <- qdata(temp_mean, c(0.025, 0.975), data=df_means)
   conf_int_diff <- qdata(temp_diff, c(0.025, 0.975), data.frame(temp_diff))
   real_mean <- mean(temp_mean)
   df_bootstrap[nrow(df_bootstrap) + 1, ] <- c(category, real_mean, conf_int[[1]], conf_int[[2]])
   df_bootstrap_diff[nrow(df_bootstrap_diff) + 1, ] <- c(category, conf_int_diff[[1]], conf_int_diff[[2]])
}
df_diff <- data.frame(values_diff, groups)
print('Bootstrap confidence intervals for the unemployment rate')
## [1] "Bootstrap confidence intervals for the unemployment rate"
df_bootstrap
##                         Major_category         Major_mean
## 1      Agriculture & Natural Resources   0.03949344086275
## 2               Biology & Life Science 0.0497187284341429
## 3                          Engineering  0.050599190474569
## 4            Humanities & Liberal Arts    0.0694564530815
## 5          Communications & Journalism  0.068903905683875
## 6              Computers & Mathematics 0.0595238330855909
## 7  Industrial Arts & Consumer Services 0.0584748503184286
## 8                            Education 0.0470195106612813
## 9                  Law & Public Policy     0.067753005361
## 10                   Interdisciplinary        0.077268968
## 11                              Health 0.0470756275661667
## 12                      Social Science 0.0657015280706111
## 13                   Physical Sciences   0.05450217519735
## 14            Psychology & Social Work 0.0778894293118889
## 15                                Arts    0.0879335844365
## 16                            Business 0.0545357439965769
##    Confidence_Interval_Left Confidence_Interval_Right
## 1           0.0332209149625           0.0455277790725
## 2        0.0423358285482143           0.0559573310875
## 3        0.0444350118827586        0.0558285690689655
## 4            0.064404804135            0.073546360035
## 5               0.063138533              0.0783436485
## 6        0.0489162128090909        0.0703780701727273
## 7        0.0495164325678571        0.0734591865892857
## 8            0.036527613275        0.0591605765421875
## 9             0.06032093337              0.0748022276
## 10              0.077268968               0.077268968
## 11         0.03862825954375             0.05514336415
## 12       0.0622989213722222        0.0686193713555556
## 13            0.04622248129           0.0635189858925
## 14       0.0706876521361111        0.0858323102166667
## 15        0.071119572046875         0.109764090271875
## 16       0.0507073998461538        0.0584605831865385
#png("bootstrap_unemployment.png", height = 50*nrow(df_bootstrap), width = 200*ncol(df_bootstrap))
#grid.table(df_bootstrap)
#dev.off()
p1 <- ggplot(df_diff[df_diff$groups == major_categories[1], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#00BFC4", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[1]) + theme(text = element_text(size = 12))  
p2 <- ggplot(df_diff[df_diff$groups == major_categories[2], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#619CFF", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[2])+ theme(text = element_text(size = 12))  
p3 <- ggplot(df_diff[df_diff$groups == major_categories[3], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#C77CFF", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[3])+ theme(text = element_text(size = 12))  
p4 <- ggplot(df_diff[df_diff$groups == major_categories[4], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#E76BF3", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[4])+ theme(text = element_text(size = 12))  
p5 <- ggplot(df_diff[df_diff$groups == major_categories[5], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "yellow", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[5])+ theme(text = element_text(size = 12))  
p6 <- ggplot(df_diff[df_diff$groups == major_categories[6], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "blue", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[6])+ theme(text = element_text(size = 12))  
p7 <- ggplot(df_diff[df_diff$groups == major_categories[7], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "red", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[7])+ theme(text = element_text(size = 12))  
p8 <- ggplot(df_diff[df_diff$groups == major_categories[8], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "pink", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[8])+ theme(text = element_text(size = 12))  
p9 <- ggplot(df_diff[df_diff$groups == major_categories[9], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "green", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[9])+ theme(text = element_text(size = 12))  
p10 <- ggplot(df_diff[df_diff$groups == major_categories[10], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "purple", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[10])+ theme(text = element_text(size = 12))  
p11 <- ggplot(df_diff[df_diff$groups == major_categories[11], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "black", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[11])+ theme(text = element_text(size = 12))  
p12 <- ggplot(df_diff[df_diff$groups == major_categories[12], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "darkgrey", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[12])+ theme(text = element_text(size = 12))  
p13 <- ggplot(df_diff[df_diff$groups == major_categories[13], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "darkgreen", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[13])+ theme(text = element_text(size = 12))  
p14 <- ggplot(df_diff[df_diff$groups == major_categories[14], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "magenta", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[14])+ theme(text = element_text(size = 12))  
p15 <- ggplot(df_diff[df_diff$groups == major_categories[15], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "cyan", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[15])+ theme(text = element_text(size = 12))  
p16 <- ggplot(df_diff[df_diff$groups == major_categories[16], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "brown", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[16])+ theme(text = element_text(size = 12))  
p <- grid.arrange(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 4)

g <- arrangeGrob(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 8)
ggsave(file = "diff1.png", g)
## Saving 20 x 6 in image
#ggplot(df_diff, aes(x = values_diff, fill = groups)) + geom_histogram(position = "identity", alpha = 0.2, bins = 50) 
#df_diff
#values_diff

Bootstrap confidence intevals for differences of the mean unemployment rate and unemployment rate by category

pvalue = numeric(16)
for (j in c(1:length(major_categories))){
  category = major_categories[j]
  pvalue[j] = t.test(df_diff[df_diff$groups == category,]$values_diff, mu=0, alternative="two.sided", data=df_diff)[[3]]
  print(pvalue[j])
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 2.474286e-59
## [1] 4.669145e-13
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 5.301603e-137
## [1] 0
## [1] 0
## [1] 0
df_bootstrap_diff$`P-value` <- pvalue
head(df_bootstrap_diff, 16)
##                         Major_category         Diff_CI_Left
## 1      Agriculture & Natural Resources  -0.0247428359787139
## 2               Biology & Life Science  -0.0155771557415359
## 3                          Engineering  -0.0138152449754036
## 4            Humanities & Liberal Arts  0.00629042532638729
## 5          Communications & Journalism  0.00435606497987716
## 6              Computers & Mathematics -0.00871314181171833
## 7  Industrial Arts & Consumer Services -0.00881552828627168
## 8                            Education  -0.0211335155923591
## 9                  Law & Public Policy  0.00264086311083815
## 10                   Interdisciplinary   0.0168907524972543
## 11                              Health   -0.019350352129263
## 12                      Social Science  0.00398009678219333
## 13                   Physical Sciences   -0.011700411911474
## 14            Psychology & Social Work    0.012435299589483
## 15                                Arts   0.0136421108909682
## 16                            Business -0.00764476394614273
##            Diff_CI_Right       P-value
## 1     -0.011121408839552  0.000000e+00
## 2  -0.000604010618806789  0.000000e+00
## 3  -0.000711154687108837  0.000000e+00
## 4     0.0170979134726686  0.000000e+00
## 5     0.0208008589722543  0.000000e+00
## 6     0.0137591215350762  2.474286e-59
## 7     0.0164299433625516  4.669145e-13
## 8    0.00175790144691112  0.000000e+00
## 9      0.017867069058526  0.000000e+00
## 10    0.0226793521432081  0.000000e+00
## 11   -0.0013407030026975  0.000000e+00
## 12    0.0124613264399326  0.000000e+00
## 13   0.00652923863699421 5.301603e-137
## 14    0.0290498582691394  0.000000e+00
## 15    0.0525007208970556  0.000000e+00
## 16   0.00215556920401289  0.000000e+00
#png("bootstrap_unemployment_diff.png", height = 50*nrow(df_bootstrap_diff), width = 200*ncol(df_bootstrap_diff))
#grid.table(df_bootstrap_diff)
#dev.off()

As we can see, in most of major_categories the population mean does not hit into the confidence interval. We can conclude that our null hypothesis is not rejected. All major categories has their own unemployment rates.

To find if we can confidence to these data and understand the type of distributions we can create Normal Probability Plots for each difference distribution

p1 <- ggplot(df_diff[df_diff$groups == major_categories[1], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[1]) + theme(text = element_text(size = 20))
p2 <- ggplot(df_diff[df_diff$groups == major_categories[2], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[2]) + theme(text = element_text(size = 20)) 
p3 <- ggplot(df_diff[df_diff$groups == major_categories[3], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[3]) + theme(text = element_text(size = 20))
p4 <- ggplot(df_diff[df_diff$groups == major_categories[4], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[4]) + theme(text = element_text(size = 20))
p5 <- ggplot(df_diff[df_diff$groups == major_categories[5], ],aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[5]) + theme(text = element_text(size = 20))  
p6 <- ggplot(df_diff[df_diff$groups == major_categories[6], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[6]) + theme(text = element_text(size = 20))
p7 <- ggplot(df_diff[df_diff$groups == major_categories[7], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[7]) + theme(text = element_text(size = 20))  
p8 <- ggplot(df_diff[df_diff$groups == major_categories[8], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[8]) + theme(text = element_text(size = 20))  
p9 <- ggplot(df_diff[df_diff$groups == major_categories[9], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[9]) + theme(text = element_text(size = 20))  
p10 <- ggplot(df_diff[df_diff$groups == major_categories[10], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[10]) + theme(text = element_text(size = 20))  
p11 <- ggplot(df_diff[df_diff$groups == major_categories[11], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[11]) + theme(text = element_text(size = 20))
p12 <- ggplot(df_diff[df_diff$groups == major_categories[12], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[12]) + theme(text = element_text(size = 20)) 
p13 <- ggplot(df_diff[df_diff$groups == major_categories[13], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[13]) + theme(text = element_text(size = 20)) 
p14 <- ggplot(df_diff[df_diff$groups == major_categories[14], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[14]) + theme(text = element_text(size = 20)) 
p15 <- ggplot(df_diff[df_diff$groups == major_categories[15], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[15]) + theme(text = element_text(size = 20))  
p16 <- ggplot(df_diff[df_diff$groups == major_categories[16], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[16]) + theme(text = element_text(size = 20))  
grid.arrange(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 4,
             top = textGrob("Normal Probability Plots",gp=gpar(fontsize=26,font=1)))

g <- arrangeGrob(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 8)
ggsave(file = "qqplot1.png", g)
## Saving 20 x 10 in image

To illustrate the variance of unemployment rate by major category the number of boxplots is illustrated bellow:

ggplot(data=df_all, aes(x = Unemployment_rate, y=Major_category, color = Major_category)) + geom_boxplot() + theme(text = element_text(size = 20)) #+stat_summary(fun.data = mean_cl_boot, geom = "errorbar",  width=0.5, linewidth=1)

#ggsave('unemployment_rate.png')

To make a conclusion about the difference of salaries between different majors the next hypothesis is done:

H0: There is significant difference between in the median salary between major categories HA: There is no significant difference between in the median salary between major categories

For checking the hypothesis it can be compared each major median salary with median salary of all majors. To compare these variables bootstrap can be used:

nsims = 2000
df_means <- data.frame(matrix(ncol = 2, nrow = 0))
df_bootstrap <- data.frame(matrix(ncol = 4, nrow = 0))
df_bootstrap_diff <- data.frame(matrix(ncol = 3, nrow = 0))
df_all_means <- data.frame(matrix(ncol = 1, nrow = 0))
df_diff <- data.frame(matrix(ncol = 2, nrow = 0))

colnames(df_bootstrap) <- c('Major_category', 'Mean', 'Confidence Interval Left', 'Confidence Interval Right')
colnames(df_means) <- c('Median_salary', 'Major_category')
colnames(df_bootstrap_diff) <- c('Major_category', 'Diff_CI_Left', 'Diff_CI_Right')
colnames(df_diff) <- c('Major_category', 'Diff')
colnames(df_all_means) <- c('Mean')
means_salary_rate <- list()
nsize_all = nrow(df_all)

means_salary_rate <- list()
temp_all_mean <- rep(0, nsims)
for (i in (1:nsims)) {
      temp_all_mean[i] = mean(sample(df_all$Median, nsize_all, replace = TRUE)) 
}
df_all_means <- data.frame(temp_all_mean)
values_diff <- numeric(0)
groups <- numeric(0)
for (j in c(1:length(major_categories))){
    temp_mean <- rep(0, nsims)
    temp_diff <- rep(0, nsims)
    category = major_categories[j]
    for (i in (1:nsims)) {
      temp_mean[i] = mean(sample(df_all[df_all$Major_category == category, ]$Median, nsize[[j]], replace = TRUE)) 
      temp_diff[i] = temp_mean[i] - temp_all_mean[i]
    }
   df_means <- c(category, data.frame(temp_mean))
   values_diff <- c(values_diff, temp_diff)
   groups <- c(groups, rep(category, nsims))
   conf_int <- qdata(temp_mean, c(0.025, 0.975), data=df_means)
   conf_int_diff <- qdata(temp_diff, c(0.025, 0.975), data.frame(temp_diff))
   real_mean <- mean(temp_mean)
   df_bootstrap[nrow(df_bootstrap) + 1, ] <- c(category, real_mean, conf_int[[1]], conf_int[[2]])
   df_bootstrap_diff[nrow(df_bootstrap_diff) + 1, ] <- c(category, conf_int_diff[[1]], conf_int_diff[[2]])

}
df_diff <- data.frame(values_diff, groups)
print('Bootstrap confidence intervals for the median salary')
## [1] "Bootstrap confidence intervals for the median salary"
df_bootstrap
##                         Major_category             Mean
## 1      Agriculture & Natural Resources         55021.85
## 2               Biology & Life Science 50859.3571428571
## 3                          Engineering 77784.3620689655
## 4            Humanities & Liberal Arts 46080.4033333333
## 5          Communications & Journalism            49511
## 6              Computers & Mathematics 66208.7272727273
## 7  Industrial Arts & Consumer Services 52509.7857142857
## 8                            Education     43807.165625
## 9                  Law & Public Policy          52754.2
## 10                   Interdisciplinary        20795.946
## 11                              Health 56587.8541666667
## 12                      Social Science 53156.6666666667
## 13                   Physical Sciences         62344.35
## 14            Psychology & Social Work 44597.1111111111
## 15                                Arts       43555.0625
## 16                            Business 60563.5769230769
##    Confidence Interval Left Confidence Interval Right
## 1                   51597.5                     58600
## 2          47391.9642857143          53786.6071428571
## 3          73239.6551724138          82897.4137931034
## 4          44446.6666666667          47680.3333333333
## 5                     48500                     50000
## 6          60272.7272727273          73002.2727272727
## 7          44355.3571428571          61430.3571428571
## 8                41612.1875               46456.40625
## 9                     49200                     57200
## 10                   1107.6                 41943.325
## 11         49458.3333333333          67005.2083333333
## 12         49441.6666666667          58111.1111111111
## 13                  58397.5                   67102.5
## 14         40888.8888888889          49555.5555555556
## 15                    41225                     45525
## 16         56692.3076923077          64384.6153846154
#png("bootstrap_salary.png", height = 50*nrow(df_bootstrap), width = 200*ncol(df_bootstrap))
#grid.table(df_bootstrap)
#dev.off()
ggplot(data=df_all, aes(x = Median, y=Major_category, color = Major_category)) + geom_boxplot() + theme(text = element_text(size = 20)) 

#+ stat_summary(fun.data = mean_cl_boot, geom = "errorbar",  width=0.5, linewidth=1)
ggsave('median_salary.png')
## Saving 15 x 6 in image
p1 <- ggplot(df_diff[df_diff$groups == major_categories[1], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#00BFC4", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[1]) + theme(text = element_text(size = 12))  
p2 <- ggplot(df_diff[df_diff$groups == major_categories[2], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#619CFF", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[2])+ theme(text = element_text(size = 12))  
p3 <- ggplot(df_diff[df_diff$groups == major_categories[3], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#C77CFF", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[3])+ theme(text = element_text(size = 12))  
p4 <- ggplot(df_diff[df_diff$groups == major_categories[4], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "#E76BF3", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[4])+ theme(text = element_text(size = 12))  
p5 <- ggplot(df_diff[df_diff$groups == major_categories[5], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "yellow", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[5])+ theme(text = element_text(size = 12))  
p6 <- ggplot(df_diff[df_diff$groups == major_categories[6], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "blue", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[6])+ theme(text = element_text(size = 12))  
p7 <- ggplot(df_diff[df_diff$groups == major_categories[7], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "red", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[7])+ theme(text = element_text(size = 12))  
p8 <- ggplot(df_diff[df_diff$groups == major_categories[8], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "pink", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[8])+ theme(text = element_text(size = 12))  
p9 <- ggplot(df_diff[df_diff$groups == major_categories[9], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "green", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[9])+ theme(text = element_text(size = 12))  
p10 <- ggplot(df_diff[df_diff$groups == major_categories[10], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "purple", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[10])+ theme(text = element_text(size = 12))  
p11 <- ggplot(df_diff[df_diff$groups == major_categories[11], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "black", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[11])+ theme(text = element_text(size = 12))  
p12 <- ggplot(df_diff[df_diff$groups == major_categories[12], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "darkgrey", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[12])+ theme(text = element_text(size = 12))  
p13 <- ggplot(df_diff[df_diff$groups == major_categories[13], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "darkgreen", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[13])+ theme(text = element_text(size = 12))  
p14 <- ggplot(df_diff[df_diff$groups == major_categories[14], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "magenta", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[14])+ theme(text = element_text(size = 12))  
p15 <- ggplot(df_diff[df_diff$groups == major_categories[15], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "cyan", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[15])+ theme(text = element_text(size = 12))  
p16 <- ggplot(df_diff[df_diff$groups == major_categories[16], ], aes(x = values_diff, fill = groups)) + geom_histogram(fill = "brown", position = "identity", alpha = 0.2, bins = 50) + xlab(major_categories[16])+ theme(text = element_text(size = 12))  
grid.arrange(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 4)

g <- arrangeGrob(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 8)
ggsave(file = "diff2.png", g)
## Saving 20 x 6 in image
pvalue = numeric(16)
for (j in c(1:length(major_categories))){
  category = major_categories[j]
  pvalue[j] = t.test(df_diff[df_diff$groups == category,]$values_diff, mu=0, alternative="two.sided", data=df_diff)[[3]]
}

df_bootstrap_diff$`P-value` <- pvalue
head(df_bootstrap_diff, n = 16)
##                         Major_category      Diff_CI_Left     Diff_CI_Right
## 1      Agriculture & Natural Resources -6054.76878612717  2529.01734104046
## 2               Biology & Life Science -10022.2285301404 -2162.97584640793
## 3                          Engineering  16007.5169423958  26584.8963524018
## 4            Humanities & Liberal Arts -13614.1290944123 -8014.97398843931
## 5          Communications & Journalism -9711.04046242775 -5033.39595375723
## 6              Computers & Mathematics  3117.75486074619  16509.9829217026
## 7  Industrial Arts & Consumer Services -12775.3179190751  4762.33278282411
## 8                            Education -16232.5045158959  -9596.3674132948
## 9                  Law & Public Policy -8397.76011560694  696.618497109828
## 10                   Interdisciplinary -55793.4947976879 -14904.8248554913
## 11                              Health -7782.91064547206  10664.4159441233
## 12                      Social Science    -8185.15253693  1825.24727039176
## 13                   Physical Sciences  1063.88728323699  11022.4855491329
## 14            Psychology & Social Work -16662.6236351959 -6950.79158638408
## 15                                Arts -16403.1828034682 -10134.7976878613
## 16                            Business -557.226545131176  8155.68252556692
##          P-value
## 1  4.758396e-226
## 2   0.000000e+00
## 3   0.000000e+00
## 4   0.000000e+00
## 5   0.000000e+00
## 6   0.000000e+00
## 7  6.163206e-286
## 8   0.000000e+00
## 9   0.000000e+00
## 10  0.000000e+00
## 11  5.426102e-02
## 12  0.000000e+00
## 13  0.000000e+00
## 14  0.000000e+00
## 15  0.000000e+00
## 16  0.000000e+00
#png("bootstrap_salary_diff.png", height = 50*nrow(df_bootstrap_diff), width = 200*ncol(df_bootstrap_diff))
#grid.table(df_bootstrap_diff)
#dev.off()
p1 <- ggplot(df_diff[df_diff$groups == major_categories[1], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[1]) + theme(text = element_text(size = 16))
p2 <- ggplot(df_diff[df_diff$groups == major_categories[2], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[2]) + theme(text = element_text(size = 16)) 
p3 <- ggplot(df_diff[df_diff$groups == major_categories[3], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[3]) + theme(text = element_text(size = 16))
p4 <- ggplot(df_diff[df_diff$groups == major_categories[4], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[4]) + theme(text = element_text(size = 16))
p5 <- ggplot(df_diff[df_diff$groups == major_categories[5], ],aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[5]) + theme(text = element_text(size = 16))  
p6 <- ggplot(df_diff[df_diff$groups == major_categories[6], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[6]) + theme(text = element_text(size = 16))
p7 <- ggplot(df_diff[df_diff$groups == major_categories[7], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[7]) + theme(text = element_text(size = 16))  
p8 <- ggplot(df_diff[df_diff$groups == major_categories[8], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[8]) + theme(text = element_text(size = 16))  
p9 <- ggplot(df_diff[df_diff$groups == major_categories[9], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[9]) + theme(text = element_text(size = 16))  
p10 <- ggplot(df_diff[df_diff$groups == major_categories[10], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[10]) + theme(text = element_text(size = 16))  
p11 <- ggplot(df_diff[df_diff$groups == major_categories[11], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[11]) + theme(text = element_text(size = 16))
p12 <- ggplot(df_diff[df_diff$groups == major_categories[12], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[12]) + theme(text = element_text(size = 16)) 
p13 <- ggplot(df_diff[df_diff$groups == major_categories[13], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[13]) + theme(text = element_text(size = 16)) 
p14 <- ggplot(df_diff[df_diff$groups == major_categories[14], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[14]) + theme(text = element_text(size = 16)) 
p15 <- ggplot(df_diff[df_diff$groups == major_categories[15], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[15]) + theme(text = element_text(size = 16))  
p16 <- ggplot(df_diff[df_diff$groups == major_categories[16], ], aes(sample = values_diff)) + stat_qq(col='blue')  +   stat_qq_line(col='red') +  xlab(major_categories[16]) + theme(text = element_text(size = 16))  
grid.arrange(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 4,
             top = textGrob("Normal Probability Plots",gp=gpar(fontsize=26,font=1)))

g <- arrangeGrob(p1, p2, p3, p4,
             p5, p6, p7, p8,
             p9, p10, p11, p12,
             p13, p14, p15, p16, ncol = 8, 
              top = textGrob("Normal Probability Plots",gp=gpar(fontsize=26,font=1)))
ggsave(file = "qqplot2.png", g)
## Saving 10 x 10 in image

Table of correlation between female proportion and median salary in each category

women_cat <- unique(df_women$Major_category)
df_correlation = data.frame(matrix(ncol = 2, nrow = 0))
colnames(df_correlation) = c('Major category', 'Correlation coefficient')
for (category in women_cat){
  df_women_cat <- df_women[df_women$Major_category == category,]
  correlation = cor(df_women_cat$Median, df_women_cat$ShareWomen)
  df_correlation[nrow(df_correlation) + 1,] <- c(category, correlation)
}
print(paste('Coefficient of correlation using data for all categories =',cor(df_women$Median, df_women$ShareWomen)))
## [1] "Coefficient of correlation using data for all categories = -0.614711476104321"

Coefficient of correlation between proportion of women in major and median salary by Major group:

df_correlation
##                         Major category Correlation coefficient
## 1                          Engineering        -0.3229433388508
## 2                             Business      -0.314249837263197
## 3                    Physical Sciences      -0.109029687176233
## 4                  Law & Public Policy      -0.399546665435339
## 5              Computers & Mathematics      0.0680772399912517
## 6      Agriculture & Natural Resources      -0.880237415815281
## 7  Industrial Arts & Consumer Services      -0.502175035056826
## 8                                 Arts      -0.600657937316326
## 9                               Health      -0.250541571126713
## 10                      Social Science      -0.594340180180328
## 11              Biology & Life Science      -0.180365846054399
## 12                           Education      -0.475942244966131
## 13           Humanities & Liberal Arts      -0.267407718223315
## 14            Psychology & Social Work      -0.677393068439678
## 15         Communications & Journalism      -0.642674070247747
## 16                   Interdisciplinary                    <NA>
ggplot(df_women, aes(x = ShareWomen, y = Median, col = Major_category)) + geom_point(size = 5, position="jitter") + xlab("Proportion of women") + ylab("Median Salary") + ggtitle('Median salary to Proportion of women by major') + geom_smooth(method="lm", col="red") + theme(text = element_text(size = 20))+scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'

ggsave('salary_correlation.png')
## Saving 10 x 6 in image
## `geom_smooth()` using formula = 'y ~ x'

Fitting linear model for the relationship between median salary and proportion of women

predict_model = lm(Median ~ ShareWomen, data = df_women)
predict_model$coef
## (Intercept)  ShareWomen 
##    56130.94   -30579.83

We can predict \(S_{Median} = 56130.94 - 30579.83 * P_{Women}\), where S - salary, P - proportion Coefficient of correlation is -0.6147

This model should follow two conditions to be accepted: 1. Normal distribution of variable. For checking this condition we have to calculate residuals between each value and predicted model and use Normal probability plot for residuals to check the normality. 2. Homoscedasticity of distribution - For each distinct value of the x-variable (the predictor variable), the y variable has the same standard deviation \(\sigma\)

fv = predict_model$fitted.values
res = predict_model$residuals
diagnosticdf = data.frame(fv, res)
ggplot(diagnosticdf, aes(sample = res)) + stat_qq(col='blue')  +  stat_qq_line(col='red') + theme(text = element_text(size = 16)) + ggtitle("Normal Probability Plot")

ggsave('qqplot3.png')
## Saving 7 x 5 in image

To predict a homoscedasticity we plot fitted values to residuals

ggplot(diagnosticdf, aes(x = fv, y = res)) +  geom_point(size=2, col='blue', position="jitter") + xlab("Predicted Median Salary") + ylab("Residuals") + ggtitle("Plot of Fitts to Residuals") + geom_hline(yintercept=0, color="red", linetype="dashed") + theme(text = element_text(size = 16))

ggsave('homoscedasticity.png')
## Saving 7 x 5 in image